/**************************************************************
 * * Copyright (c) 2019-2020 Huawei Technologies Co., Ltd.
 *
 * * License under BSD 3-Clause "New" or "Revised" License
 *
 **************************************************************/

/* Everything below is assembled into the .text section. */
.text

/* Export the entry point and tag it as a function symbol (ELF-style .type). */
.global gf_4vect_dot_prod_neon
.type gf_4vect_dot_prod_neon, %function


/*
 * Register aliases used by gf_4vect_dot_prod_neon.
 *
 * Each q_* alias names the same SIMD register as its v_* counterpart; the
 * q form is used for 128-bit loads/stores and the v form for vector
 * arithmetic.
 */

/* arguments */
x_len		.req	x0	/* number of bytes to process per buffer */
x_vec		.req	x1	/* number of source buffers; rescaled to vec * 8 on entry */
x_tbl		.req	x2	/* base address of the lookup tables */
x_src		.req	x3	/* array of 8-byte source buffer pointers */
x_dest		.req	x4	/* array of four 8-byte destination buffer pointers */

/* returns */
w_ret		.req	w0	/* 0 = success, 1 = failure; shares x0 with x_len */

/* local variables */
x_vec_i		.req	x5	/* byte offset into the x_src pointer array */
x_ptr		.req	x6	/* working source / destination pointer */
x_pos		.req	x7	/* byte offset of the chunk currently being processed */
x_tmp		.req	x8	/* scratch for the tail-length check */
x_dest1		.req	x9	/* destination buffer 1 */
x_tbl1		.req	x10	/* running table pointer for destination 1 */
x_dest2		.req	x11	/* destination buffer 2 */
x_tbl2		.req	x12	/* running table pointer for destination 2 */
x_dest3		.req	x13	/* destination buffer 3 */
x_tbl3		.req	x14	/* running table pointer for destination 3 */
x_dest4		.req	x_dest	/* destination buffer 4; reuses x4, so it must be loaded last */
x_tbl4		.req	x15	/* running table pointer for destination 4 */

/* vectors */
v_mask0f	.req	v0	/* 0x0f in every byte lane: low-nibble mask */
q_mask0f	.req	q0
v_tmp1_lo	.req	v1	/* scratch: low-nibble lookup result */
v_tmp1_hi	.req	v2	/* scratch: high-nibble lookup result, then the combined product */
v_tmp1		.req	v3	/* scratch: low-nibble indices */
q_tmp1		.req	q3

/* accumulators for bytes 0..15 of each destination (the only ones the 16-byte loop uses) */
v_p1_0		.req	v4
v_p2_0		.req	v5
v_p3_0		.req	v6
v_p4_0		.req	v7

q_p1_0		.req	q4
q_p2_0		.req	q5
q_p3_0		.req	q6
q_p4_0		.req	q7

/*
 * Four 16-byte source chunks for the 64-byte loop. v8-v11 belong to the
 * v8-v15 range that the function saves (as d8-d15) before using.
 */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11

/* accumulators for bytes 48..63 of each destination; v12-v15 are also in the saved range */
v_p1_3		.req	v12
v_p2_3		.req	v13
v_p3_3		.req	v14
v_p4_3		.req	v15
q_p1_3		.req	q12
q_p2_3		.req	q13
q_p3_3		.req	q14
q_p4_3		.req	q15

/*
 * Lookup tables for the current source buffer, one lo/hi pair per
 * destination: _lo is indexed by the low nibble of a data byte, _hi by the
 * high nibble.
 */
v_gft1_lo	.req	v16
v_gft1_hi	.req	v17
v_gft2_lo	.req	v18
v_gft2_hi	.req	v19
v_gft3_lo	.req	v20
v_gft3_hi	.req	v21
v_gft4_lo	.req	v22
v_gft4_hi	.req	v23
q_gft1_lo	.req	q16
q_gft1_hi	.req	q17
q_gft2_lo	.req	q18
q_gft2_hi	.req	q19
q_gft3_lo	.req	q20
q_gft3_hi	.req	q21
q_gft4_lo	.req	q22
q_gft4_hi	.req	q23

/* accumulators for bytes 16..31 (_1) and 32..47 (_2) of each destination */
v_p1_1		.req	v24
v_p1_2		.req	v25
v_p2_1		.req	v26
v_p2_2		.req	v27
v_p3_1		.req	v28
v_p3_2		.req	v29
v_p4_1		.req	v30
v_p4_2		.req	v31

q_p1_1		.req	q24
q_p1_2		.req	q25
q_p2_1		.req	q26
q_p2_2		.req	q27
q_p3_1		.req	q28
q_p3_2		.req	q29
q_p4_1		.req	q30
q_p4_2		.req	q31

/* 16-byte loop names: the scratch registers v1-v3 are reused as data holders */
v_data		.req	v_tmp1
q_data		.req	q_tmp1
v_data_lo	.req	v_tmp1_lo
v_data_hi	.req	v_tmp1_hi

/*
 * gf_4vect_dot_prod_neon
 *
 * Computes four output buffers from `vec` source buffers using split-nibble
 * table lookups: for every byte position and every destination j (1..4)
 *
 *     dest_j[pos] = XOR over sources i of
 *                   tbl_lo[j][i][src_i[pos] & 0x0f] ^ tbl_hi[j][i][src_i[pos] >> 4]
 *
 * (judging by the function name this is the usual table form of a GF(2^8)
 * multiply-accumulate; the code itself only performs the lookups and XORs).
 *
 * Presumed C prototype - confirm against the declaring header:
 *     int gf_4vect_dot_prod_neon(int len, int vec, unsigned char *gftbls,
 *                                unsigned char **src, unsigned char **dest);
 *
 * In:
 *     x0  len   bytes per buffer; must be >= 16
 *     x1  vec   number of source buffers; must be >= 1 (both source loops
 *               test at the bottom, and the 16-byte loop exits on equality only)
 *     x2  tbl   lookup tables: destination j, source i uses the 32 bytes at
 *               tbl + (j-1)*vec*32 + i*32 (16 low-nibble entries followed by
 *               16 high-nibble entries)
 *     x3  src   array of `vec` source buffer pointers
 *     x4  dest  array of 4 destination buffer pointers
 * Out:
 *     w0  0 on success, 1 if len < 16 (nothing is written in that case)
 * Clobbers:
 *     x1, x4-x15, v0-v7, v16-v31, flags. d8-d15 are saved and restored on
 *     the path that uses them; 64 bytes of stack are used on that path.
 *
 * NOTE(review): len and vec are consumed as full 64-bit registers. If the C
 * prototype declares them as 32-bit ints, the upper halves of x0/x1 must be
 * well defined on entry - confirm with the callers / ABI in use.
 *
 * Tail handling: when len is not a multiple of 16, the final pass is
 * re-run at offset len - 16, so it recomputes and rewrites a few bytes that
 * were already stored rather than processing a partial vector.
 *
 * Alignment: the entry point and the four loop heads are aligned to
 * 16 bytes with .p2align 4. On AArch64 the operand of .align is a power of
 * two, so the function is deliberately not aligned with ".align 16" (which
 * would request 64 KiB alignment). Labels that are only reached by
 * fall-through, and the cold exit labels, are left unaligned so that no NOP
 * padding lands on an executed path.
 */
.p2align 4
gf_4vect_dot_prod_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.return_fail

	movi	v_mask0f.16b, #0x0f
	mov	x_pos, #0
	/* x_vec = vec * 8: size in bytes of the src pointer array */
	lsl	x_vec, x_vec, #3
	ldr	x_dest1, [x_dest, #8*0]
	ldr	x_dest2, [x_dest, #8*1]
	ldr	x_dest3, [x_dest, #8*2]
	/* x_dest4 aliases x_dest, so this load must stay last */
	ldr	x_dest4, [x_dest, #8*3]

.Lloop64_init:
	/* less than 64 bytes, goto Lloop16_init */
	cmp	x_len, #64
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack (callee-saved; the 64-byte loop uses v8 ~ v15) */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* x_len = len - 64 while in this loop: x_pos <= x_len means 64 bytes remain */
	sub	x_len, x_len, #64

/* ---- 64 bytes per iteration ---- */
.p2align 4
.Lloop64:
	/* clear the 4 destinations x 4 chunks accumulators */
	movi	v_p1_0.16b, #0
	movi	v_p1_1.16b, #0
	movi	v_p1_2.16b, #0
	movi	v_p1_3.16b, #0
	movi	v_p2_0.16b, #0
	movi	v_p2_1.16b, #0
	movi	v_p2_2.16b, #0
	movi	v_p2_3.16b, #0
	movi	v_p3_0.16b, #0
	movi	v_p3_1.16b, #0
	movi	v_p3_2.16b, #0
	movi	v_p3_3.16b, #0
	movi	v_p4_0.16b, #0
	movi	v_p4_1.16b, #0
	movi	v_p4_2.16b, #0
	movi	v_p4_3.16b, #0

	/* rewind the table pointers; tables are (x_vec << 2) = vec * 32 bytes apart */
	mov	x_tbl1, x_tbl
	add	x_tbl2, x_tbl1, x_vec, lsl #2
	add	x_tbl3, x_tbl2, x_vec, lsl #2
	add	x_tbl4, x_tbl3, x_vec, lsl #2
	mov	x_vec_i, #0
	prfm	pldl1keep, [x_tbl1]
	prfm	pldl1keep, [x_tbl2]
	prfm	pldl1keep, [x_tbl3]
	prfm	pldl1keep, [x_tbl4]

/* one pass per source buffer */
.p2align 4
.Lloop64_vects:
	/* x_ptr = src[i] + x_pos */
	ldr	x_ptr, [x_src, x_vec_i]
	add	x_vec_i, x_vec_i, #8
	add	x_ptr, x_ptr, x_pos

	/* 64 bytes of source data plus this source's lo/hi table pair per destination */
	ldr	q_data_0, [x_ptr], #16
	ldr	q_data_1, [x_ptr], #16
	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32
	ldp	q_gft4_lo, q_gft4_hi, [x_tbl4], #32
	ldr	q_data_2, [x_ptr], #16
	ldr	q_data_3, [x_ptr], #16

	/* prefetch hints: the next 64 source bytes and the next table entries */
	prfm	pldl1strm, [x_ptr]
	prfm	pldl1keep, [x_tbl1]
	prfm	pldl1keep, [x_tbl2]
	prfm	pldl1keep, [x_tbl3]
	prfm	pldl1keep, [x_tbl4]

	/*
	 * For each chunk: v_tmp1 receives the low-nibble indices and v_data_N
	 * is overwritten in place with the high-nibble indices. Both are in
	 * 0..15, so every tbl lookup hits the 16-entry table.
	 */

	/* data_0 */
	and	v_tmp1.16b, v_data_0.16b, v_mask0f.16b
	ushr	v_data_0.16b, v_data_0.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_0.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p4_0.16b, v_p4_0.16b, v_tmp1_hi.16b

	/* data_1 */
	and	v_tmp1.16b, v_data_1.16b, v_mask0f.16b
	ushr	v_data_1.16b, v_data_1.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_1.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p4_1.16b, v_p4_1.16b, v_tmp1_hi.16b

	/* data_2 */
	and	v_tmp1.16b, v_data_2.16b, v_mask0f.16b
	ushr	v_data_2.16b, v_data_2.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_2.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p4_2.16b, v_p4_2.16b, v_tmp1_hi.16b

	/* data_3 */
	and	v_tmp1.16b, v_data_3.16b, v_mask0f.16b
	ushr	v_data_3.16b, v_data_3.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_3.16b
	eor	v_tmp1_hi.16b, v_tmp1_lo.16b, v_tmp1_hi.16b
	eor	v_p4_3.16b, v_p4_3.16b, v_tmp1_hi.16b

	/* next source buffer until x_vec_i reaches vec * 8 */
	cmp	x_vec_i, x_vec
	blt	.Lloop64_vects

.Lloop64_vects_end:
	/* store 64 result bytes to each destination at offset x_pos */
	add	x_ptr, x_dest1, x_pos
	stp	q_p1_0, q_p1_1, [x_ptr], #32
	stp	q_p1_2, q_p1_3, [x_ptr]

	add	x_ptr, x_dest2, x_pos
	stp	q_p2_0, q_p2_1, [x_ptr], #32
	stp	q_p2_2, q_p2_3, [x_ptr]

	add	x_ptr, x_dest3, x_pos
	stp	q_p3_0, q_p3_1, [x_ptr], #32
	stp	q_p3_2, q_p3_3, [x_ptr]

	add	x_ptr, x_dest4, x_pos
	stp	q_p4_0, q_p4_1, [x_ptr], #32
	stp	q_p4_2, q_p4_3, [x_ptr]

	add	x_pos, x_pos, #64
	cmp	x_pos, x_len
	ble	.Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 */
	ldp	d8,  d9,  [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	/* x_len = len again; finished if the 64-byte loop consumed everything */
	add	x_len, x_len, #64
	cmp	x_pos, x_len
	beq	.return_pass

.Lloop16_init:
	/* from here on x_len = len - 16: the last offset a 16-byte pass may start at */
	sub	x_len, x_len, #16
	cmp	x_pos, x_len
	bgt	.lessthan16_init

/* ---- 16 bytes per iteration; uses only v0 ~ v7 and v16 ~ v23 ---- */
.p2align 4
.Lloop16:
	movi	v_p1_0.16b, #0
	movi	v_p2_0.16b, #0
	movi	v_p3_0.16b, #0
	movi	v_p4_0.16b, #0
	mov	x_tbl1, x_tbl
	add	x_tbl2, x_tbl1, x_vec, lsl #2
	add	x_tbl3, x_tbl2, x_vec, lsl #2
	add	x_tbl4, x_tbl3, x_vec, lsl #2
	mov	x_vec_i, #0

/* one pass per source buffer */
.p2align 4
.Lloop16_vects:
	/* q_data = 16 bytes of src[i] at offset x_pos */
	ldr	x_ptr, [x_src, x_vec_i]
	add	x_vec_i, x_vec_i, #8
	ldr	q_data, [x_ptr, x_pos]

	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32
	ldp	q_gft4_lo, q_gft4_hi, [x_tbl4], #32

	prfm	pldl1keep, [x_tbl1]
	prfm	pldl1keep, [x_tbl2]
	prfm	pldl1keep, [x_tbl3]
	prfm	pldl1keep, [x_tbl4]

	/* split each data byte into low / high nibble indices */
	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	/* lookups overwrite the table registers; they are reloaded for every source */
	tbl	v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	tbl	v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	tbl	v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	tbl	v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
	tbl	v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b

	/* product = lo ^ hi, then fold it into the accumulator */
	eor	v_gft1_lo.16b, v_gft1_hi.16b, v_gft1_lo.16b
	eor	v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
	eor	v_gft2_lo.16b, v_gft2_hi.16b, v_gft2_lo.16b
	eor	v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
	eor	v_gft3_lo.16b, v_gft3_hi.16b, v_gft3_lo.16b
	eor	v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
	eor	v_gft4_lo.16b, v_gft4_hi.16b, v_gft4_lo.16b
	eor	v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b

	/* exits only when x_vec_i == vec * 8, hence the vec >= 1 requirement */
	cmp	x_vec_i, x_vec
	bne	.Lloop16_vects

.Lloop16_vects_end:
	str	q_p1_0, [x_dest1, x_pos]
	str	q_p2_0, [x_dest2, x_pos]
	str	q_p3_0, [x_dest3, x_pos]
	str	q_p4_0, [x_dest4, x_pos]
	add	x_pos, x_pos, #16
	cmp	x_pos, x_len
	ble	.Lloop16

.Lloop16_end:
	/* x_pos - (len - 16) == 16  <=>  x_pos == len: every byte has been written */
	sub	x_tmp, x_pos, x_len
	cmp	x_tmp, #16
	beq	.return_pass

.lessthan16_init:
	/*
	 * 1 ~ 15 bytes remain: redo one pass at offset len - 16, overlapping
	 * bytes that were already written. That pass leaves x_pos == len, so
	 * the check in .Lloop16_end then returns.
	 */
	mov	x_pos, x_len
	b	.Lloop16

.return_pass:
	mov	w_ret, #0
	ret

.return_fail:
	mov	w_ret, #1
	ret
